Update of explore-JULES-ES-1p0.Rmd that uses the new packages and functions
We build and test gaussian process emulators, perform a sensitivity analyis, and constrain the input space of Jules.
Andy thinks there might be mileage in analysing the atmospheric growth, which is here. /home/h01/hadaw/Research/ES_PPE/Oct20
At the moment, this vignette is hampered by the fact that emulators are failing on a few of the outputs which represent change over the historical perid. The emulator is fine for predicting absolute values in the modern period.
Andy would like to see timeseries of: cVeg, cSoil and nbp, npp in GtC and GtC/yr.
Load libraries, functions and data.
# Load helper functions
knitr::opts_chunk$set(fig.path = "figs/", echo = TRUE, message = FALSE, warnings = FALSE)
# load some helper functions
source('~/brazilCSSP/code/brazil_cssp/per_pft.R') # eventually, move the relevant functions
source('explore-JULES-ES-1p0_PPE_functions.R')
# Load packages
library(RColorBrewer)
library(fields)
library(MASS)
library(DiceKriging)
library(DiceEval)
library(ncdf4)
library(ncdf4.helpers)
library(foreach)
library(emtools)
library(imptools)
library(viztools)
library(julesR)
# Some pallete options
yg = brewer.pal(9, "YlGn")
ryb = brewer.pal(11, "RdYlBu")
byr = rev(ryb)
rb = brewer.pal(11, "RdBu")
br = rev(rb)
blues = brewer.pal(9, 'Blues')
cbPal <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
ysec = 60*60*24*365
years <- 1850:2013
ensloc <- '/project/carbon_ppe/JULES-ES-1p0_PPE/'
ensmember <- 'P0000/'
subdir <- 'stats/'
floc <- paste0(ensloc,ensmember,subdir)
fn <- 'JULES-ES-1p0_P0000_Annual_global.nc'
# test file
nc <- nc_open(paste0(floc,fn))
# What variables are in the file?
varlist <- nc.get.variable.list(nc)
extractTimeseries <- function(nc, variable){
dat <- ncvar_get(nc, variable)
out <- dat
out
}
makeTimeseriesEnsemble <- function(variable, nens = 499, nts = 164, cn = 1850:2013){
# nens is number of ensemble members
# nts length of timeseries
# cn is colnames()
datmat <- matrix(NA, nrow = nens, ncol = nts)
colnames(datmat) <- cn
enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
floc <- paste0(ensloc,ensmember,subdir)
for(i in 1:nens){
vec <- rep(NA,nts)
ensmember <- enslist[i]
fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
try(nc <- nc_open(paste0(fn)))
try(dat <- extractTimeseries(nc, variable))
datmat[i, ] <- dat
nc_close(nc)
}
datmat
}
par(mfrow= c(2,2))
ylim = range(c(nbp_ens[,1], nbp_ens[,164]))
matplot(years, t(nbp_ens), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'NBP sum', main = 'NBP')
ylim = range(c(npp_ens[,1], npp_ens[,164]))
matplot(years, t(npp_ens), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'NPP sum', main = 'NPP')
ylim = range(c(cSoil_ens[,1], cSoil_ens[,164]))
matplot(years, t(cSoil_ens), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'Soil carbon sum', main = 'Soil Carbon')
ylim = range(c(cVeg_ens[,1], cVeg_ens[,164]))
matplot(years, t(cVeg_ens), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'Veg carbon sum', main = 'Vegetation Carbon')
npp_ens_anom <- anomalizeTSmatrix(npp_ens, 1:20)
nbp_ens_anom <- anomalizeTSmatrix(nbp_ens, 1:20)
cSoil_ens_anom <- anomalizeTSmatrix(cSoil_ens, 1:20)
cVeg_ens_anom <- anomalizeTSmatrix(cVeg_ens, 1:20)
par(mfrow= c(2,2))
ylim = range(c(nbp_ens_anom[,1], nbp_ens_anom[,164]))
matplot(years, t(nbp_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'NBP sum Anomaly', main = 'NBP Anomaly')
ylim = range(c(npp_ens_anom[,1], npp_ens_anom[,164]))
matplot(years, t(npp_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'NPP sum Anomaly', main = 'NPP Anomaly')
ylim = range(c(cSoil_ens_anom[,1], cSoil_ens_anom[,164]))
matplot(years, t(cSoil_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'Soil carbon sum anomaly', main = 'Soil Carbon Anomaly')
ylim = range(c(cVeg_ens_anom[,1], cVeg_ens_anom[,164]))
matplot(years, t(cVeg_ens_anom), type = 'l', lty = 'solid',ylim = ylim, col = cbPal, ylab = 'Veg carbon sum anomaly', main = 'Vegetation Carbon Anomaly')
# for each of the variables in the file, average the last 20 years as the "modern" value,
# and then place in a matrix
modernValue <- function(nc, variable, ix){
# A basic function to read a variable and
# take the mean of the timeseries at locations ix
dat <- ncvar_get(nc, variable)
out <- mean(dat[ix])
out
}
#144:164 is the 1993:2013
modernValue(nc = nc, variable = "npp_nlim_lnd_mean", ix = 144:164)
# apply to the test file to check it works
vec <- sapply(varlist, FUN = modernValue, nc = nc, ix = 144:164)
# Generate ensemble numbers, mean of the last 20 years of the timeseries (1994-2013)
if (file.exists("ensemble.rdata")) {
load("ensemble.rdata")
} else {
nens = 499
datmat <- matrix(nrow = nens, ncol = length(varlist))
colnames(datmat) <- varlist
enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
floc <- paste0(ensloc,ensmember,subdir)
for(i in 1:nens){
vec <- rep(NA, length(varlist))
ensmember <- enslist[i]
fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
#print(fn)
try(nc <- nc_open(paste0(fn)))
try(vec <- sapply(varlist, FUN = modernValue, nc = nc, ix = 144:164))
datmat[i, ] <- vec
nc_close(nc)
}
save(nens, datmat,enslist,floc, file ="ensemble.rdata")
}
For each ensemble member and each variable, calculate the change from the 20 years at the start of the run, to the twenty years at the end of the run.
tsAnomaly <- function(nc, variable, startix = 1:20, endix = 144:164){
# A basic function to read a variable and calculate the anomaly at the end of the run
dat <- ncvar_get(nc, variable)
endMean <- mean(dat[endix])
startMean <- mean(dat[startix])
out <- endMean - startMean
out
}
tsAnomaly(nc = nc, variable = "npp_nlim_lnd_mean")
## [1] 2.410932e-09
# Generate ensemble mean of the last 20 years of the timeseries (1994-2013)
if (file.exists("anomaly_ensemble.rdata")) {
load("anomaly_ensemble.rdata")
} else {
nens = 499
datmatAnom <- matrix(nrow = nens, ncol = length(varlist))
colnames(datmatAnom) <- varlist
enslist <- paste("P", formatC(0:(nens-1), width=4, flag="0"), sep="")
floc <- paste0(ensloc,ensmember,subdir)
for(i in 1:nens){
vec <- rep(NA, length(varlist))
ensmember <- enslist[i]
fn <- paste0(ensloc,ensmember,'/stats/','JULES-ES-1p0_',ensmember,'_Annual_global.nc')
#print(fn)
try(nc <- nc_open(paste0(fn)))
try(vec <- sapply(varlist, FUN = tsAnomaly, nc = nc))
datmatAnom[i, ] <- vec
nc_close(nc)
}
save(nens, datmatAnom, enslist,floc, file ="anomaly_ensemble.rdata")
}
Initial clean of data set, removing variables that don’t change, and removing NAs (models that didn’t run).
Y_nlevel0_ix <- which(is.na(datmat[,'year']))
YAnom_nlevel0_ix <- which(is.na(datmatAnom[,'year']))
# This should be true to proceed, or we'll have to start excluding the combined set.
identical(Y_nlevel0_ix, YAnom_nlevel0_ix)
## [1] TRUE
# Y is the whole data set
Y <- datmat
# Y_level0 is the 'cleaned' data set, truncated to variables that change, and removing NAs
Y_level0 <- datmat[-Y_nlevel0_ix, -c(2,30,31)]
Y_nlevel0 <- datmat[Y_nlevel0_ix, -c(2, 30, 31)]
# Y is the whole data set
YAnom <- datmatAnom
# Y.level0 is the 'cleaned' data set, truncated to variables that change, and removing NAs
YAnom_level0 <- datmatAnom[-YAnom_nlevel0_ix, -c(2,30,31)]
YAnom_nlevel0 <- datmatAnom[YAnom_nlevel0_ix, -c(2,30,31)]
# load the original design and input space, normalize to [0-1]
# Load up the data
lhs_i = read.table('~/brazilCSSP/code/brazil_cssp/analyze_u-ao732/data/ES_PPE_i/lhs_u-ao732.txt', header = TRUE)
lhs_ii = read.table('~/brazilCSSP/code/brazil_cssp/analyze_u-ao732/data/ES_PPE_ii/lhs_u-ao732a.txt', header = TRUE)
toplevel_ix = 1:499
# The raw input data is a latin hypercube
lhs = rbind(lhs_i, lhs_ii)[toplevel_ix, ]
lhs_level0 <- lhs[-Y_nlevel0_ix,]
X = normalize(lhs)
colnames(X) = colnames(lhs)
X_level0 <- X[-Y_nlevel0_ix,]
X_nlevel0 <- X[Y_nlevel0_ix,]
d = ncol(X)
# lower and higher bound on the normalised matrix for visualisation
rx = rbind(rep(0,32), rep(1,32))
(“probability of run failure” Could be an interesting project - logit transformation for probability of run failure? Some other ML like SVM?
There are clear run failure thresholds in the parameters rootd_ft_io and lai_max, and quite strong visual indications that a_wl_io and bio_hum_cn matter.
#simple way
par(oma = c(10,10,0,0))
#par(xpd = TRUE)
pairs(X_nlevel0,
xlim = c(0,1), ylim = c(0,1),
col = 'red',
gap = 0,
pch = 20,
lower.panel = NULL)
#legend('bottomright', legend = paste(1:d, colnames(lhs)), bty = 'n')
legend('left', col = 'red', pch = 20, legend = 'test')
# plot failures over everything else
#X.all <- rbind(X.level0, X.nlevel0)
#colvec <- rep('grey', nrow(X))
#colvec[(nrow(X.level0)+1):nrow(X.all)] <- 'red'
#pairs(X.all, xlim = c(0,1), ylim = c(0,1),col = colvec, gap = 0, lower.panel = NULL, pch = 20, cex = 0.5)
First, use mean NPP as an example. How does NPP respond to each parameter? NAs are removed, but zero values are still included.
p <- ncol(X_level0)
y_level0 <- Y_level0[,'npp_nlim_lnd_sum']
par(mfrow = c(5,7), mar = c(3,1,3,1), oma = c(1,1,5,1))
for(i in 1:p){
plot(X_level0[,i], y_level0, xlab = '', ylab = '', main = colnames(X_level0)[i])
}
mtext('NPP', outer = TRUE, side = 3, cex = 2, line = 2)
yanom_level0 <- YAnom_level0[,'npp_nlim_lnd_sum']
par(mfrow = c(5,7), mar = c(3,1,3,1), oma = c(1,1,5,1))
for(i in 1:p){
plot(X_level0[,i], yanom_level0, xlab = '', ylab = '', main = colnames(X_level0)[i])
}
mtext('NPP change over time', outer = TRUE, side = 3, cex = 2, line = 2)
Some outputs (e.g fLuc, fHarvest) have an almost perfect 1:1 relationship between modern value and change, some (nbp, npp, treeFrac) quite or moderately strong, and some (csoil, cveg) very weak or non-existant.
par(mfrow = c(4,7), mar = c(3,1,3,1), oma = c(4,5,1,1))
pdash <- ncol(Y_level0)
for(i in 1:pdash){
y_level0 <- Y_level0[,i]
yanom_level0 <- YAnom_level0[,i]
plot(y_level0, yanom_level0, xlab = '', ylab = '', main = colnames(Y_level0)[i])
}
mtext(text = 'Modern value', side = 1, line = 2, outer = TRUE, cex = 2)
mtext(text = 'Change', side = 2, line = 2, outer = TRUE, cex = 2)
It appears that this ensemble is less “clear cut” in having an output that clearly distinguishes between “failed” (or close to it), and “not failed”.
Having said that, having an F0 over a threshold seems to kill the carbon cycle, as before. Here, we’ve set a threshold of 0.9 (on the normalised scale) for F0, and we remove members of the ensemble with a larger F0 than that when we build emulators.
par(mfrow = c(2,1))
plot(lhs$f0_io, datmat[, 'npp_nlim_lnd_sum'], main = 'Multiplication factor', xlab = 'f0_io', ylab = 'NPP sum')
plot(X_level0[,'f0_io'], Y_level0[, 'npp_nlim_lnd_sum'], xlab = 'f0_io', main = 'Normalized', ylab = 'NPP sum')
abline(v = 0.9)
The level 1 constraint removes any input with F0 greater than 0.9 (normalised), which removes many of the zero-carbon-cycle members up front. There are 424 ensmble members remaining.
This does constraint sequentially, which may not be a good idea.
level1_ix <- which(X_level0[, 'f0_io'] < 0.9)
X_level1 <- X_level0[level1_ix, ]
Y_level1 <- Y_level0[level1_ix, ]
YAnom_level1 <- YAnom_level0[level1_ix, ]
y_level1 <- Y_level1[, 'npp_nlim_lnd_sum']
em_npp_level0 <- km(~., design = X_level0, response = y_level0)
em_npp_level1 <- km(~., design = X_level1, response = y_level1)
# Plot the regular km emulator. Doesn’t look great.
plot(em_npp_level0)
plot(em_npp_level1)
loo_npp_level0 <- leaveOneOut.km(em_npp_level0, type = 'UK', trend.reestim = TRUE)
loo_npp_level1 <- leaveOneOut.km(em_npp_level1, type = 'UK', trend.reestim = TRUE)
RMSE ( loo_npp_level0$mean, y_level0)
## [1] 10.80969
RMSE ( loo_npp_level1$mean, y_level1)
## [1] 369555
# How about MAE over the range
prop_mae <- function(Y, Ypred){
# mean absolute error as a proportion of the range of output
absdiff <- abs(diff(range(Y)))
mae <- MAE(Y, Ypred)
propmae <- (mae / absdiff) * 100
propmae
}
prop_mae(y_level0, loo_npp_level0$mean)
## [1] 8.352962
prop_mae(y_level1, loo_npp_level1$mean)
## [1] 5.220906
# Given a km model, produce some error statistics
kmLooStats <- function(km, type = 'UK'){
loo <- leaveOneOut.km(km, type = type, trend.reestim = TRUE)
mae <- MAE(Y = km@y, Ypred = loo$mean)
rmse <- RMSE(Y = km@y, Ypred = loo$mean)
maxerr <- max(loo$mean)
absdiff <- abs(diff(range(km@y)))
pmae <- (mae / absdiff) * 100
return(list(loo = loo, mae = mae, pmae = pmae, maxerr = maxerr))
}
errstats_npp_level0 <- kmLooStats(km = em_npp_level0)
errstats_npp_level1 <- kmLooStats(km = em_npp_level1)
# proportional mean absolute error
errstats_npp_level0$pmae
## [1] 8.352962
errstats_npp_level1$pmae
## [1] 5.220906
# DiceEval is useful but slow/awkward
# DE_em_npp_level0 <- modelFit(X = X_level0, Y = y_level0, type = 'Kriging', formula = ~. )
# DE_em_npp_level1 <- modelFit(X = X_level1, Y = y_level1, type = 'Kriging', formula = ~. )
# npp_em_cv_level0 <- crossValidation(DE_em_npp_level0, 5)
# npp_em_cv_level1 <- crossValidation(DE_em_npp_level1, 5)
The problem with doing constraint first is that you end up with a non-hypercube shaped input space (corners have been knocked off by constraint), which is not ideal. We might therefore want different sensitivity measures for pre- and post-constrained ensemble.
#sensvar needs to go into emtools with oaat_design
sensvar <- function(oaat_pred, n, d){
# Calculate variance as a global sensitivity meansure
out = rep(NA,d)
for(i in 1:d){
ix = seq(from = ((i*n) - (n-1)), to = (i*n), by = 1)
out[i] = var(oaat_pred$mean[ix])
}
out
}
twoStep_glmnet <- function(X, y, nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
# Use lasso to reduce input dimension of emulator before
# building.
control_list = list(trace=trace, maxit=maxit, REPORT=REPORT, factr=factr, pgtol=pgtol, pop.size=popsize)
xvars = colnames(X)
data = data.frame(response=y, x=X)
colnames(data) <- c("response", xvars)
nval = length(y)
# fit a lasso by cross validation
library(glmnet)
fit_glmnet_cv = cv.glmnet(x=X,y=y)
# The labels of the retained coefficients are here
# (retains intercept at index zero)
coef_i = (coef(fit_glmnet_cv, s = "lambda.1se"))@i
labs = labels(coef(fit_glmnet_cv, s = "lambda.1se"))[[1]]
labs = labs[-1] # remove intercept
glmnet_retained = labs[coef_i]
start_form = as.formula(paste("~ ", paste(glmnet_retained , collapse= "+")))
m = km(start_form, design=X, response=y, nugget=nugget, parinit=parinit,
nugget.estim=nuggetEstim, noise.var=noiseVar, control=control_list)
return(list(x=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim,
noiseVar=noiseVar, emulator=m, seed=seed, coefs=m@covariance@range.val,
trends=m@trend.coef, meanTerms=all.vars(start_form), fit_glmnet_cv=fit_glmnet_cv))
}
twoStep_sens <- function(X, y, n=21, predtype = 'UK', nugget=NULL, nuggetEstim=FALSE, noiseVar=NULL, seed=NULL, trace=FALSE, maxit=100,
REPORT=10, factr=1e7, pgtol=0.0, parinit=NULL, popsize=100){
# Sensitivity analysis with twoStep emulator.
# Calculates the variance of the output varied one at a time across each input.
d = ncol(X)
X_norm <- normalize(X)
X_oaat <- oaat_design(X_norm, n, med = TRUE)
colnames(X_oaat) = colnames(X)
twoStep_em = twoStep_glmnet(X=X, y=y, nugget=nugget, nuggetEstim=nuggetEstim, noiseVar=noiseVar,
seed=seed, trace=trace, maxit=maxit,
REPORT=REPORT, factr=factr, pgtol=pgtol,
parinit=parinit, popsize=popsize)
oaat_pred = predict(twoStep_em$emulator, newdata = X_oaat, type = predtype)
sens = sensvar(oaat_pred = oaat_pred, n=n, d=d)
out = sens
out
}
if (file.exists("oat_twostep.rdata")) {
load("oat_twostep.rdata")
} else {
oat_test <- twoStep_sens(X = X_level1, y = y_level1)
save(oat_test, file = "oat_twostep.rdata")
}
y_names_sum <- c('nbp_lnd_sum', 'fLuc_lnd_sum', 'npp_nlim_lnd_sum', 'cSoil_lnd_sum',
'cVeg_lnd_sum', 'landCoverFrac_lnd_sum', 'fHarvest_lnd_sum',
'lai_lnd_sum', 'rh_lnd_sum', 'treeFrac_lnd_sum', 'c3PftFrac_lnd_sum',
'c4PftFrac_lnd_sum', 'shrubFrac_lnd_sum', 'baresoilFrac_lnd_sum')
if (file.exists("oaat_Y.rdata")) {
load("oaat_Y.rdata")
} else {
oat_var_sensmat_Y <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- Y_level1[, yname]
oat <- twoStep_sens(X = X_level1, y = y)
oat_var_sensmat_Y[i, ] <- oat
}
save(y_names_sum, oat_var_sensmat_Y, file = "oaat_Y.rdata")
}
It looks as though bwl_io is very influential across a number of variables, even though it didn’t appear that interesting in the parginal plots. Why is that?
# normalize the sensitivity matrix
colnames(oat_var_sensmat_Y) <- colnames(X_level1)
rownames(oat_var_sensmat_Y) <- y_names_sum
#test <- normalize(t(oat.var.sensmat))
#image(test)
#par()
heatmap(oat_var_sensmat_Y, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')
heatmap(oat_var_sensmat_Y, mar = c(10,10), scale = 'row')
normsens_Y <- normalize(t(oat_var_sensmat_Y))
par(mar = c(12,12,5,2))
image(normsens_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)
A closer look at bwl_io now that the impact of f0_io has been removed shows a large number of “zero” NPP when bwl_io is at low values, which could well be the source of apparent sensitivity.
Take only higher values of bwl_io for the next round of constraints.
p <- ncol(X_level1)
y_level1 <- Y_level1[,'npp_nlim_lnd_sum']
par(mfrow = c(5,7), mar = c(3,1,3,1))
for(i in 1:p){
plot(X_level1[,i], y_level1, xlab = '', ylab = '', main = colnames(X_level1)[i])
}
level1a_ix <- which(X_level0[, 'f0_io'] < 0.9 & X_level0[, 'b_wl_io'] > 0.15 )
X_level1a <- X_level0[level1a_ix, ]
Y_level1a <- Y_level0[level1a_ix,]
YAnom_level1a <- YAnom_level0[level1a_ix, ]
y_level1a <- Y_level1a[, 'npp_nlim_lnd_sum']
em_level1a <- km(~., design = X_level1a, response = y_level1a)
# Plot the regular km emulator.
plot(em_level1a)
if (file.exists("oaat_level1a_Y.rdata")) {
load("oaat_level1a_Y.rdata")
} else {
oat_var_sensmat_level1a_Y <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1a))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- Y_level1a[, yname]
oat <- twoStep_sens(X = X_level1a, y = y)
oat_var_sensmat_level1a_Y[i, ] <- oat
}
save(y_names_sum, oat_var_sensmat_level1a_Y, file = "oaat_level1a_Y.rdata")
}
p <- ncol(X_level1a)
y_level1a <- Y_level1a[,'npp_nlim_lnd_sum']
par(mfrow = c(5,7), mar = c(3,1,3,1))
for(i in 1:p){
plot(X_level1a[,i], y_level1a, xlab = '', ylab = '', main = colnames(X_level1a)[i], xlim = c(0,1))
}
# normalize the sensitivity matrix
colnames(oat_var_sensmat_level1a_Y) <- colnames(X_level1a)
rownames(oat_var_sensmat_level1a_Y) <- y_names_sum
#test <- normalize(t(oat.var.sensmat))
#image(test)
#par()
heatmap(oat_var_sensmat_level1a_Y, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')
heatmap(oat_var_sensmat_level1a_Y, mar = c(10,10), scale = 'row')
normsens_level1a_Y <- normalize(t(oat_var_sensmat_level1a_Y))
par(mar = c(12,12,5,2))
image(normsens_level1a_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1a), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance across level 1a ensemble', side = 3, adj = 0, line = 2, cex = 1.8)
# This fails for "cVeg_lnd_sum"
# The result will be that some of the columns are repeated.
if (file.exists("oaat_YAnom.rdata")) {
load("oaat_YAnom.rdata")
} else {
oat_var_sensmat_YAnom <- matrix(NA, nrow = length(y_names_sum), ncol = ncol(X_level1))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- YAnom_level1[, yname]
try(oat <- twoStep_sens(X = X_level1, y = y))
oat_var_sensmat_YAnom[i, ] <- oat
}
save(y_names_sum, oat_var_sensmat_YAnom, file = "oaat_YAnom.rdata")
}
# normalize the sensitivity matrix
colnames(oat_var_sensmat_YAnom) <- colnames(X_level1)
rownames(oat_var_sensmat_YAnom) <- y_names_sum
#test <- normalize(t(oat.var.sensmat))
#image(test)
#par()
heatmap(oat_var_sensmat_YAnom, Rowv = NA, Colv = NA, mar = c(10,10), scale = 'row')
heatmap(oat_var_sensmat_YAnom, mar = c(10,6), scale = 'row')
normsens_YAnom <- normalize(t(oat_var_sensmat_YAnom))
par(mar = c(12,12,5,2))
image(normsens_YAnom, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)
It looks as though both the absolute value and the change over time are controlled by the same parameters.
par(mfrow = c(2,1), mar = c(12,12,5,2))
image(normsens_Y, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)
image(normsens_YAnom, axes = FALSE, col = blues)
axis(1, at = seq(from = 0, to = 1, length.out = p), labels = colnames(X_level1), las = 2)
axis(2, at = seq(from = 0, to = 1, length.out = length(y_names_sum)), labels = y_names_sum, las = 1)
mtext('One-at-a-time sensitivity, variance of historical change across level 1 ensemble', side = 3, adj = 0, line = 2, cex = 1.8)
The emulators appear to be at least capturing the broad response for all of the output variables.
First, plot the straight kriging emulators
if (file.exists("km_emulators_Y.rdata")) {
load("km_emulators_Y.rdata")
} else {
emlist_km_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- Y_level1[, yname]
em <- km(~., design = X_level1, response = y)
emlist_km_Y[[i]] <- em
}
loolist_km_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
loo <- leaveOneOut.km(model = emlist_km_Y[[i]], type = 'UK', trend.reestim = TRUE)
loolist_km_Y[[i]] <- loo
}
save(emlist_km_Y,loolist_km_Y, file = "km_emulators_Y.rdata")
}
loostats_km_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(emlist_km_Y)){
loostats <- kmLooStats(emlist_km_Y[[i]])
loostats_km_Y[[i]] <- loostats
print(loostats$pmae)
}
## [1] 5.433077
## [1] 5.142409
## [1] 5.2209
## [1] 4.193005
## [1] 4.219396
## [1] 4.531841
## [1] 5.629425
## [1] 8.565018
## [1] 5.224583
## [1] 9.223968
## [1] 7.089
## [1] 7.78196
## [1] 6.900971
## [1] 6.929568
par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_km_Y)){
y <- Y_level1[, y_names_sum[i]]
loo <- loolist_km_Y[[i]]
ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
pch = 19)
segments(x0 = y, y0 = loo$mean - (2*loo$sd) , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
abline(0,1)
legend('bottomright',legend = paste('mae =',round(loostats_km_Y[[i]]$pmae,2),'%') , bty = 'n')
}
mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2)
mtext('Modern Value (Y)', side = 3, line = 0, outer = TRUE, cex = 2)
if (file.exists("km_emulators_YAnom.rdata")) {
load("km_emulators_YAnom.rdata")
} else {
emlist_km_YAnom <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- YAnom_level1[, yname]
em <- km(~., design = X_level1, response = y)
emlist_km_YAnom[[i]] <- em
}
loolist_km_YAnom <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
loo <- leaveOneOut.km(model = emlist_km_YAnom[[i]], type = 'UK', trend.reestim = TRUE)
loolist_km_YAnom[[i]] <- loo
}
save(emlist_km_YAnom,loolist_km_YAnom, file = "km_emulators_YAnom.rdata")
}
par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_km_YAnom)){
y <- YAnom_level1[, y_names_sum[i]]
loo <- loolist_km_YAnom[[i]]
ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
pch = 19)
segments(x0 = y, y0 = loo$mean - (2*loo$sd) , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
abline(0,1)
}
mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2)
mtext('Change over time (YAnom)', side = 3, line = 1, outer = TRUE, cex = 2)
Next, plot the twostep glmnet/km emulators
# Twostep glmnet emulators
if (file.exists("ts_emulators_Y.rdata")) {
load("ts_emulators_Y.rdata")
} else {
emlist_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
yname <- y_names_sum[i]
y <- Y_level1[, yname]
em <- twoStep_glmnet(X = X_level1, y = y)
emlist_twoStep_glmnet_Y[[i]] <- em
}
loolist_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(y_names_sum)){
loo <- leaveOneOut.km(model = emlist_twoStep_glmnet_Y[[i]]$emulator, type = 'UK', trend.reestim = TRUE)
loolist_twoStep_glmnet_Y[[i]] <- loo
}
save(emlist_twoStep_glmnet_Y, loolist_twoStep_glmnet_Y, file = "ts_emulators_Y.rdata")
}
# Can get numerical performance using
loostats_twoStep_glmnet_Y <- vector(mode = 'list', length = length(y_names_sum))
for(i in 1:length(emlist_twoStep_glmnet_Y)){
loostats <- kmLooStats(emlist_twoStep_glmnet_Y[[i]]$emulator)
loostats_twoStep_glmnet_Y[[i]] <- loostats
print(loostats$pmae)
}
## [1] 5.410355
## [1] 4.733929
## [1] 5.215455
## [1] 4.173501
## [1] 4.03861
## [1] 4.597044
## [1] 5.501413
## [1] 8.563912
## [1] 5.213222
## [1] 11.36184
## [1] 7.260354
## [1] 7.707121
## [1] 6.725096
## [1] 6.865819
par(mfrow = c(4,4), mar = c(3,4,2,2), oma = c(4,4,4,0.1))
for(i in 1:length(loolist_twoStep_glmnet_Y)){
y <- Y_level1[, y_names_sum[i]]
loo <- loolist_twoStep_glmnet_Y[[i]]
ylim <- range(c(loo$mean - (2*loo$sd), loo$mean + (2*loo$sd)) )
plot(y, loo$mean, xlab = '', ylab = '', main = y_names_sum[i] , ylim = ylim, col = makeTransparent('black', 70),
pch = 19)
segments(x0 = y, y0 = loo$mean - (2*loo$sd) , x1 = y , y1 = loo$mean + (2*loo$sd), col = makeTransparent('black', 70))
abline(0,1)
legend('bottomright',legend = paste('mae =',round(loostats_twoStep_glmnet_Y[[i]]$pmae,2),'%') , bty = 'n')
}
mtext('Actual', side = 1, line = 1, outer = TRUE, cex = 2 )
mtext('Predicted', side = 2, line = 0, outer = TRUE, cex = 2)
mtext('Modern Value (Y)', side = 3, line = 0, outer = TRUE, cex = 2)
We use the leave-one-out Mean Absolute Error, expressed as a percentage of the range of the output across the ensemble. We find that the twostep emulatorisn’t significantly more accurate, and is indeed less accurate for tree fraction.
km_pmae <- sapply(loostats_km_Y, '[[', 'pmae')
ts_pmae <- sapply(loostats_twoStep_glmnet_Y, '[[', 'pmae')
par(mar = c(12,4,2,1), las =1 )
plot(1:length(y_names_sum), km_pmae,
ylim = c(0,15), pch = 19,
axes = FALSE, xlab = '',
ylab = 'LOO MAE (% of range)',
cex = 1.2)
points(1:length(y_names_sum),ts_pmae , col= 'red', pch = 19, cex = 1.2)
legend('topleft', c('km emulator', 'twoStep emulator'), pch = 19, col = c('black', 'red'), pt.cex = 1.2)
axis (2)
par(las = 2)
axis(1, at = 1:length(y_names_sum), labels = y_names_sum)
# Write an emulator wrapper
# Write a leave-one-out wrapper
# Create a list of emulator fits, for both km and twoStep methods.
# Use mclapply to build the lists
# use code from HDE package
# Make sure errors are handled adequately.
# Can we write it to use any emulator? (i.e km, twostep, something from another package?)
library(doParallel)
registerDoParallel(cores = detectCores() - 1)
direct_twoStep_glmnet_parallel <- function(X, Y, ...){
d <- ncol(Y)
out <- vector(mode='list', length=d)
foreach(i = 1:d) %dopar% {
y <- Y[,i]
em <- twoStep_glmnet(X=X, y = y, ...)
out[i] <- em
}
out
}
# I actually wrote code to do this createKmFitList, which isn't parallel at the moment.
# Maybe make it parallel in the next version?
library(doParallel)
registerDoParallel(cores = detectCores() - 1)
direct_twoStep_glmnet_parallel <- function(X, Y, ...){
d <- ncol(Y)
foreach(i = 1:d) %dopar% {
y <- Y[,i]
em <- twoStep_glmnet(X=X, y = y, ...)
}
}
To use History Matching, we need to specify targets for various model outputs. We treat these targets as “observations” with an uncertainty where a model run marked as “implausible” (beyond 3sd) matches the hard boundaries previously identified by A. Wiltshire as being desirable.
Choose the centre of the (implied) uniform distribution. cs_gb.target = (3000 - 750) / 2 = 1125 cv.target = (800 - 300) / 2 = 250 npp_n_gb.target = (80 - 35) / 2 = 22.5
nbp.target = 0 (gpp.target = 75) (runoff.target = 1)
(to do: visualise implausibility of design and loo emulated values of design as histogram)
#Y.target = c(cs_gb.target, cv.target, npp_n_gb.target, runoff.target, nbp.target)
ynames_const <- c('nbp_lnd_sum', 'npp_nlim_lnd_sum', 'cSoil_lnd_sum', 'cVeg_lnd_sum')
yunits_const <- c('GtC/year', 'GtC/year', 'GtC', 'GtC')
Y_const_level1a <- Y_level1a[, ynames_const]
scalevec <- c(1e12/ysec, 1e12/ysec, 1e12, 1e12)
Y_const_level1a_scaled <- sweep(Y_const_level1a, 2, STATS = scalevec, FUN = '/' )
# This is a "normalisation vector", for making the output numbers more manageable.
#cs_gb cv gpp_gb nbp npp_n_gb runoff
norm_vec = c(1e12, 1e12, 1e12/ysec , 1e12, 1e12, 1e9)
# nbp npp csoil cveg
Y_lower <- c(-10, 35, 750, 300)
Y_upper <- c(10, 80, 3000, 800)
# I'm going to set it so that + 4sd aligns approximately with the original limits
# given by Andy Wiltshire. This gives room for uncertainty from the emulator
Y_target = Y_upper - (abs(Y_upper - (Y_lower)) / 2 )# abs() to fix the problem with negative numbers
# standard deviation is derived from the limits and the central target
# (this distance is assumed to be 4 standard deviations.
Y_sd = (Y_upper - Y_target) / 4
names(Y_sd) = colnames(Y_const_level1a_scaled)
p = ncol(Y_const_level1a_scaled)
obs_sd_list = as.list(rep(0.01,p))
disc_list = as.list(rep(0,p))
disc_sd_list = as.list(Y_sd)
thres = 3
mins_aug = apply(X_level1a, 2, FUN = min)
maxes_aug =apply(X_level1a, 2, FUN = max)
# convert Y_target for ingestion into function
Y_target = matrix(Y_target, nrow = 1)
# Histogram of level 1 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(2,2), fg = 'darkgrey', las = 1)
hist(Y_const_level1a_scaled[,'nbp_lnd_sum'], col = hcol, main = 'NBP', xlab = 'GtC/year')
polygon(x = c(-10, 100, 100, -10), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
hist(Y_const_level1a_scaled[,'npp_nlim_lnd_sum'], col = hcol , main = 'NPP', xlab = 'GtC/year')
polygon(x = c(35, 80, 80, 35), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
hist(Y_const_level1a_scaled[,'cSoil_lnd_sum'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')
polygon(x = c(750, 3000, 3000, 750), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
hist(Y_const_level1a_scaled[,'cVeg_lnd_sum'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')
polygon(x = c(300, 800, 800, 300), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
The function addNroyDesignPoints builds an emulator for each model output in Y. It compares the output of each emulator at a number of candidate desin points, and chooses a space-filling set of them that that are Not Ruled Out Yet (statistically close to the observation at Y_target).
# Final output needs to be expressed in terms of original LHS, then put back out to conf files.
# This function adds n.aug potential design points, and finds their implausibility
# score in X.nroy
wave1 = addNroyDesignPoints(X = X_level1a,
Y = Y_const_level1a_scaled,
Y_target = Y_target,
n_aug = 50000,
mins_aug = mins_aug,
maxes_aug = maxes_aug,
thres = 3,
disc_list=disc_list,
disc_sd_list = disc_sd_list,
obs_sd_list = obs_sd_list,
n_app = 500,
nreps = 500)
The function write_jules_design here simply takes the points calculated by addNroyDesignPoints and writes them to configuration files.
# this needs sorting
source('~/myRpackages/julesR/vignettes/default_jules_parameter_perturbations.R')
# Easiest way to generate a design of the right size is to have a "fac" which takes
# the names from the parameter list, and then multiplies everything by 0.5 or 2
tf <- 'l_vg_soil'
# we don't want anything that is TRUE/FALSE to be in fac
fac_init <- names(paramlist)
not_tf_ix <- which(names(paramlist)!=tf)
paramlist_trunc <-paramlist[not_tf_ix]
fac <- names(paramlist_trunc)
maxfac <-lapply(paramlist_trunc,function(x) x$max[which.max(x$max)] / x$standard[which.max(x$max)])
minfac <- lapply(paramlist_trunc,function(x) x$min[which.max(x$max)] / x$standard[which.max(x$max)])
# create a directory for the configuration files
confdir <- 'conf_files_augment_JULES-ES-1p0'
dir.create(confdir)
## Warning in dir.create(confdir): 'conf_files_augment_JULES-ES-1p0' already exists
X_mm <- wave1$X_mm
# This is the function that writes the configuration files.
write_jules_design(X_mm = X_mm, paramlist=paramlist, n = nrow(X_mm),
fac = fac, minfac = minfac, maxfac = maxfac,
tf = tf,
fnprefix = paste0(confdir,'/param-perturb-P'),
lhsfn = paste0(confdir,'/lhs_example.txt'),
stanfn = paste0(confdir,'/stanparms_example.txt'),
allstanfn = paste0(confdir,'/allstanparms_example.txt'),
rn = 12,
startnum = 500)
Check that the augmented design produces what we expect. New ensemble members should be somewhat constrained within the boundaries of the original design, if the comparison to data offers any constraint.
X_mm <- wave1$X_mm
pairs(rbind(X, X_mm), xlim = c(0,1), ylim = c(0,1), gap = 0, lower.panel = NULL,
col = c(rep('grey', nrow(X)), rep('red', nrow(X_mm))),
pch = c(rep(21, nrow(X)), rep(20, nrow(X_mm)))
)
par(xpd = NA)
legend('bottom',
legend = c('Original design', 'New points'),
col = c('grey', 'red'),
inset = 0.15,
cex = 1.5,
pch = c(21,20)
)
Visualise the predicted outputs at the NROY points of the old design, and the new suggested design.
Y_mm_list <- vector(mode ='list', length = length(wave1$fit_list))
for(i in 1:length(wave1$fit_list)){
y_mm <- predict(object=wave1$fit_list[[i]], newdata = wave1$X_mm, type = 'UK')
Y_mm_list[[i]] <- y_mm
}
Interestingly, these aren’t perfectly within the original hard boundaries set by Andy, even though I’ve set those boundaries to be the +- 4 standard deviation threholds in the History Match. I suggest this is because there is model discrepancy, and that there is considerable wriggle room induced from emulator uncertainty.
In particular, it appears that vegetation carbon is difficult to keep high, and that many NROY proposed members have a fairly low vegetation carbon. This might need a discrepancy term, or adjusting in some other way. It certainly needs exploring, and a OAAT plot might give clues as to the parameters to choose.
# Histogram of level 1 constraints
hcol = 'darkgrey'
lcol = 'black'
par(mfrow = c(2,2), fg = 'darkgrey', las = 1)
discsd <- c(disc_sd_list, recursive = TRUE)
hist(Y_const_level1a_scaled[,'nbp_lnd_sum'], col = hcol, main = 'NBP', xlab = 'GtC/year')
hist(Y_mm_list[[1]]$mean, add = TRUE, col = 'black')
polygon(x = c(-10, 100, 100, -10), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
rug(Y_target[1], col = 'black')
rug(c(Y_target[1] + 3*discsd[1],Y_target[1] - 3*discsd[1]) , col = 'red')
## Warning in rug(c(Y_target[1] + 3 * discsd[1], Y_target[1] - 3 * discsd[1]), :
## some values will be clipped
hist(Y_const_level1a_scaled[,'npp_nlim_lnd_sum'], col = hcol , main = 'NPP', xlab = 'GtC/year')
hist(Y_mm_list[[2]]$mean, add = TRUE, col = 'black')
polygon(x = c(35, 80, 80, 35), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
rug(Y_target[2], col = 'black')
rug(c(Y_target[2] + 3*discsd[2],Y_target[2] - 3*discsd[2]) , col = 'red')
hist(Y_const_level1a_scaled[,'cSoil_lnd_sum'], col = hcol, main = 'Soil Carbon', xlab = 'GtC')
hist(Y_mm_list[[3]]$mean, add = TRUE, col = 'black')
polygon(x = c(750, 3000, 3000, 750), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
rug(Y_target[3], col = 'black')
rug(c(Y_target[3] + 3*discsd[3],Y_target[3] - 3*discsd[3]) , col = 'red')
hist(Y_const_level1a_scaled[,'cVeg_lnd_sum'], col = hcol, main = 'Vegetation Carbon', xlab = 'GtC')
hist(Y_mm_list[[4]]$mean, add = TRUE, col = 'black')
polygon(x = c(300, 800, 800, 300), y = c(0, 0, 1000, 1000),
col = makeTransparent('tomato2'),
border = makeTransparent('tomato2'))
rug(Y_target[4], col = 'black')
rug(c(Y_target[4] + 3*discsd[4],Y_target[4] - 3*discsd[4]) , col = 'red')
How good are the four emulators that we’ve built? Are there biases? (there’s no real evidence of this)
par(mfrow = c(2,2))
for(i in 1:4){
hist(wave1$pred_list[[i]]$mean, main = colnames(Y_const_level1a_scaled)[i])
}
It’s obviously hard to maintain a high vegetation carbon in particular. What parameter values might you choose to do this, and what might be the trade-offs you have to make?
X_oaat_level1a <- oaat_design(X_level1a, n=21, med = TRUE)
colnames(X_oaat_level1a) = colnames(X)
y_oaat <- predict.km(wave1$fit_list[[4]], newdata = X_oaat_level1a, type = 'UK')
oaatLinePlot <- function(X_oaat, y_oaat_mean, y_oaat_sd,
n_oaat, nr, nc, linecol = 'black', shadecol = 'grey', ...){
col.transp <- adjustcolor(shadecol, alpha = 0.5)
par(mfrow = c(6,6), oma = c(0.1,0.1,3,0.1), mar = c(2,2,3,1))
for(i in 1:ncol(X_oaat)){
ix <- seq(from = ((i*n_oaat) - (n_oaat-1)), to = (i*n_oaat), by = 1)
plot(X_oaat[ix,i], y_oaat_mean[ix],
xlim = c(0,1), ylim = range(c(y_oaat_mean+(2*y_oaat_sd), y_oaat_mean-(2*y_oaat_sd))) ,
xlab = colnames(X_oaat)[i],
type= 'n',
bty = 'n', ...)
polygon(x = c(X_oaat[ix, i], rev(X_oaat[ix, i])),
y = c(y_oaat_mean[ix] - (2*y_oaat$sd[ix]), rev(y_oaat_mean[ix] + (2*y_oaat_sd[ix]))),
col = col.transp, border = col.transp)
lines(X_oaat[ix,i], y_oaat_mean[ix], xlim = c(0,1), lty = 'solid', col = linecol)
mtext(colnames(X_oaat)[i], side = 3, line = 0.5)
}
mtext('One-at-a-time sensitivity', side = 3, outer = TRUE, cex = 1.5)
}
First, what parameters affect vegetation carbon and how? How sure are we about that?
oaatLinePlot(X_oaat = X_oaat_level1a, y_oaat_mean = y_oaat$mean, y_oaat_sd = y_oaat$sd,
n_oaat = 21,nr = 6, nc = 6)
Y_oaat_const_level1a_scaled <- matrix(ncol = ncol(Y_const_level1a_scaled), nrow = nrow(X_oaat_level1a))
for(i in 1:ncol(Y_const_level1a_scaled)){
y_oaat <- predict.km(wave1$fit_list[[i]], newdata = X_oaat_level1a, type = 'UK')
Y_oaat_const_level1a_scaled[,i] <- y_oaat$mean
}
oaatLinePlotMulti <- function(X_oaat, Y_oaat, n_oaat, nr, nc, ...){
par(mfrow = c(nr,nc), oma = c(0.1,0.1,3,0.1), mar = c(2,2,3,1))
for(i in 1:ncol(X_oaat)){
ix <- seq(from = ((i*n_oaat) - (n_oaat-1)), to = (i*n_oaat), by = 1)
plot(X_oaat[ix,i], Y_oaat[ix,1],
xlim = c(0,1), ylim = c(0,1),
xlab = colnames(X_oaat)[i],
type= 'n',
bty = 'n')
for(j in 1:ncol(Y_oaat)){
lines(X_oaat_level1a[ix,i], Y_oaat[ix, j], lty = 'solid', col = j, ...)
mtext(colnames(X_oaat_level1a)[i], side = 3, line = 0.5)
}
}
}
What might be the trade-offs for a high (or accurate) vegetation carbon? are they acceptable? Plot the oaat sensitivity of the other 3 outputs we’re calibrating on.
Y_oaat_const_level1a_scaled_norm <- normalize(Y_oaat_const_level1a_scaled)
oaatLinePlotMulti(X_oaat = X_oaat_level1a, Y_oaat = Y_oaat_const_level1a_scaled_norm , n_oaat = 21, nr = 6, nc = 6,
lwd = 2)
reset()
legend('top', c('nbp', 'npp', 'csoil', 'cveg'), col = c(1,2,3,4), lty = 'solid', lwd = 2, horiz = TRUE)
# with 3 processers, using foreach parallel processing takes approximately 1/3 the time (around 20 seconds per emulator) of looping the emulator fitting directly. I'd hope this would scale with processor numbers - it'll be worth trying on SPICE.
# system.time(testloop_par <- direct_twoStep_glmnet_parallel(X=X_level1, Y = Y_level1))
direct_twoStep_glmnet <- function(X, Y, ...){
d <- ncol(Y)
out <- vector(mode='list', length=d)
for(i in 1:d){
y <- Y[,i]
em <- twoStep_glmnet(X=X, y = y, ...)
out[i] <- em
}
out
}
# system.time(testloop <- direct_twoStep_glmnet(X=X_level1, Y_level1))
# Test parallel processing against a foreach processing (and maybe mclapply?)
#system.time(testloop_par <- direct_twoStep_glmnet_parallel(X=X_level1, Y_level1))
emList <- function(X, Y){
# Create a list of objects to be emulated
# X design matrix with (nrow) instances of (ncol) inputs
# Y matrix of outputs, with one row
# for each row of X
d <- ncol(Y)
em_list <- vector(mode='list', length=d)
for(i in 1:d){
em_obj <- NULL
em_obj$X <- X
em_obj$y <- Y[, i]
em_list[[i]] <- em_obj
}
em_list
}
#test <- emlist(X_level1, Y_level1)
#mclapply(X = test, FUN = km)
# function from hde for direct prediction
direct.pred = function (form, X, Y, Xnew, ...){
# Directly applies km in parallel to predict each column of an ensemble
ens.list = emlist(X = X, Y = Y)
km.list = mclapply(ens.list, FUN = km.wrap, form = form)
pred.list = mclapply(km.list, FUN = km.pred.wrap, Xnew = as.matrix(Xnew, nrow = 1), type = "UK")
out.mean = sapply(pred.list, FUN=extract.predmean)
out.sd = sapply(pred.list, FUN=extract.predsd)
return(list(mean = out.mean, sd = out.sd))
}